{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "fb07d191-7cc1-43e8-a9b0-ba6bf46a4246",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Dataset: BBC news\n",
    "%run -i \"../util/lang_utils.ipynb\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "04f7ecf3-9842-40ef-9964-21b11fea4622",
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_dataset\n",
    "from nltk import word_tokenize\n",
    "from math import ceil\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from nltk.corpus import stopwords\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "c850d828-a899-4c4b-97f2-2b18d9c66989",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Repo card metadata block was not found. Setting CardData to empty.\n",
      "Repo card metadata block was not found. Setting CardData to empty.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                                                   text  label     label_text\n",
      "0     wales want rugby league training wales could f...      2          sport\n",
      "1     china aviation seeks rescue deal scandal-hit j...      1       business\n",
      "2     rock band u2 break ticket record u2 have smash...      3  entertainment\n",
      "3     markets signal brazilian recovery the brazilia...      1       business\n",
      "4     tough rules for ringtone sellers firms that fl...      0           tech\n",
      "...                                                 ...    ...            ...\n",
      "1220  us economy shows solid gdp growth the us econo...      1       business\n",
      "1221  microsoft releases bumper patches microsoft ha...      0           tech\n",
      "1222  stuart joins norwich from addicks norwich have...      2          sport\n",
      "1223  why few targets are better than many the econo...      1       business\n",
      "1224  boothroyd calls for lords speaker betty boothr...      4       politics\n",
      "\n",
      "[1225 rows x 3 columns]\n",
      "                                                  text  label     label_text\n",
      "0    carry on star patsy rowlands dies actress pats...      3  entertainment\n",
      "1    sydney to host north v south game sydney will ...      2          sport\n",
      "2    uk coal plunges into deeper loss shares in uk ...      1       business\n",
      "3    blair joins school sailing trip the prime mini...      4       politics\n",
      "4    bath faced with tindall ultimatum mike tindall...      2          sport\n",
      "..                                                 ...    ...            ...\n",
      "995  mobile multimedia slow to catch on there is no...      0           tech\n",
      "996  owen determined to stay in madrid england forw...      2          sport\n",
      "997  mobile tv tipped as one to watch scandinavians...      0           tech\n",
      "998  stormy year for property insurers a string of ...      1       business\n",
      "999  what the election should really be about  a ge...      4       politics\n",
      "\n",
      "[1000 rows x 3 columns]\n"
     ]
    }
   ],
   "source": [
    "train_dataset = load_dataset(\"SetFit/bbc-news\", split=\"train\")\n",
    "test_dataset = load_dataset(\"SetFit/bbc-news\", split=\"test\")\n",
    "train_df = train_dataset.to_pandas()\n",
    "test_df = test_dataset.to_pandas()\n",
    "print(train_df)\n",
    "print(test_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "c82a16c4-fbd3-4e67-b433-bc5de33e856a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                                                document  \\\n",
      "0      National Archives \\n \\n Yes, it’s that time ag...   \n",
      "1      LOS ANGELES (AP) — In her first interview sinc...   \n",
      "2      GAITHERSBURG, Md. (AP) — A small, private jet ...   \n",
      "3      Tucker Carlson Exposes His Own Sexism on Twitt...   \n",
      "4      A man accused of removing another man's testic...   \n",
      "...                                                  ...   \n",
      "44967  More than 670,000 copies of the Pearls’ self-p...   \n",
      "44968  Seeking out cost-conscious consumers who have ...   \n",
      "44969  Click to email this to a friend (Opens in new ...   \n",
      "44970  BARRINGTON, R.I. (AP) — Women clad in yoga pan...   \n",
      "44971  Based on a ‘real’ story, the hit John Travolta...   \n",
      "\n",
      "                                                 summary  \n",
      "0      – The unemployment rate dropped to 8.2% last m...  \n",
      "1      – Shelly Sterling plans \"eventually\" to divorc...  \n",
      "2      – A twin-engine Embraer jet that the FAA descr...  \n",
      "3      – Tucker Carlson is in deep doodoo with conser...  \n",
      "4      – What are the three most horrifying words in ...  \n",
      "...                                                  ...  \n",
      "44967  – The deaths of three children have been linke...  \n",
      "44968  – Apple is hoping its new, cheaper iPhone can ...  \n",
      "44969  – January Jones, who plays the beleaguered wif...  \n",
      "44970  – A Rhode Island man who penned a letter to th...  \n",
      "44971  – The Guardian revisits a fascinating piece of...  \n",
      "\n",
      "[44972 rows x 2 columns]\n"
     ]
    }
   ],
   "source": [
    "train_dataset = load_dataset(\"multi_news\", split=\"train\")\n",
    "test_dataset = load_dataset(\"multi_news\", split=\"test\")\n",
    "train_df = train_dataset.to_pandas()\n",
    "test_df = test_dataset.to_pandas()\n",
    "print(train_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "a416fe4b-b954-498d-8514-d58d9f78efda",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<style>#sk-container-id-3 {color: black;}#sk-container-id-3 pre{padding: 0;}#sk-container-id-3 div.sk-toggleable {background-color: white;}#sk-container-id-3 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-3 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-3 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-3 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-3 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-3 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-3 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-3 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-3 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-3 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-3 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-3 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-3 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-3 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-3 div.sk-label:hover label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-3 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-3 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-3 div.sk-item {position: relative;z-index: 1;}#sk-container-id-3 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-3 div.sk-item::before, #sk-container-id-3 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-3 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-3 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-3 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-3 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-3 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-3 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-3 div.sk-label-container {text-align: center;}#sk-container-id-3 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-3 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-3\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>TfidfVectorizer(max_df=0.95, min_df=2, stop_words=&#x27;english&#x27;)</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-3\" type=\"checkbox\" checked><label for=\"sk-estimator-id-3\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">TfidfVectorizer</label><div class=\"sk-toggleable__content\"><pre>TfidfVectorizer(max_df=0.95, min_df=2, stop_words=&#x27;english&#x27;)</pre></div></div></div></div></div>"
      ],
      "text/plain": [
       "TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vectorizer = TfidfVectorizer(stop_words='english', min_df=2, max_df=0.95)\n",
    "vectorizer.fit(train_df[\"text\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "55944bbd-4f91-4010-adda-7d23e117ff6d",
   "metadata": {},
   "outputs": [],
   "source": [
    "def sort_data_tfidf_score(coord_matrix):\n",
    "    tuples = zip(coord_matrix.col, coord_matrix.data)\n",
    "    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "120119c2-9b22-4d2b-9bf6-1ec22117f768",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_keyword_strings(vectorizer, num_words, sorted_vector):\n",
    "    words = []\n",
    "    index_dict = vectorizer.get_feature_names_out()\n",
    "    for (item_index, score) in sorted_vector[0:num_words]:\n",
    "        word = index_dict[item_index]\n",
    "        words.append(word)\n",
    "    return words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "d94036a8-fa7f-4292-abd4-cefc149f6f78",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_keywords_simple(vectorizer, input_text, num_output_words=10):\n",
    "    vector = vectorizer.transform([input_text])\n",
    "    sorted = sort_data_tfidf_score(vector.tocoo())\n",
    "    words = get_keyword_strings(vectorizer, num_output_words, sorted)\n",
    "    return words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "b03d9ad2-002d-40b3-8ce9-f09845981b33",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "carry on star patsy rowlands dies actress patsy rowlands  known to millions for her roles in the carry on films  has died at the age of 71.  rowlands starred in nine of the popular carry on films  alongside fellow regulars sid james  kenneth williams and barbara windsor. she also carved out a successful television career  appearing for many years in itv s well-loved comedy bless this house. rowlands died in hove on saturday morning  her agent said.  born in january 1934  rowlands won a scholarship to the guildhall school of speech and drama scholarship when she was just 15.  after spending several years at the players theatre in london  she made her film debut in 1963 in tom jones  directed by tony richardson. she made her first carry on film in 1969 where she appeared in carry on again doctor. rowlands played the hard-done-by wife or the put-upon employee as a regular carry on star. she also appeared in carry on at your convenience  carry on matron and carry on loving  as well as others.  in recent years she appeared in bbc mini-series the cazalets and played mrs potts in the london stage version of beauty and the beast. agent simon beresford said:  she was just an absolutely favourite client she never complained about anything  particularly when she was ill  she was an old trouper.  she was of the old school - she had skills from musical theatre and high drama  that is why she worked with the great and the good of directors.  she didn t mind always being recognised for the carry on films because she thoroughly enjoyed making them. she was a really lovely person and she will be much missed.  her last appearance on stage was as mrs pearce in the award-winning production of my fair lady at the national theatre. previously married  she leaves one son  alan. her funeral will be a private  family occasion  with a memorial service at a later date.\n",
      "['carry', 'theatre', 'scholarship', 'appeared', 'films', 'mrs', 'agent', 'drama', 'died', 'school']\n"
     ]
    }
   ],
   "source": [
    "print(test_df.iloc[0][\"text\"])\n",
    "keywords = get_keywords_simple(vectorizer, test_df.iloc[0][\"text\"])\n",
    "print(keywords)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "91f7a135-3d11-489c-a5d1-38617449794d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<style>#sk-container-id-4 {color: black;}#sk-container-id-4 pre{padding: 0;}#sk-container-id-4 div.sk-toggleable {background-color: white;}#sk-container-id-4 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-4 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-4 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-4 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-4 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-4 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-4 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-4 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-4 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-4 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-4 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-4 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-4 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-4 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-4 div.sk-label:hover label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-4 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-4 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-4 div.sk-item {position: relative;z-index: 1;}#sk-container-id-4 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-4 div.sk-item::before, #sk-container-id-4 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-4 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-4 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-4 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-4 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-4 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-4 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-4 div.sk-label-container {text-align: center;}#sk-container-id-4 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-4 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-4\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>TfidfVectorizer(max_df=0.95, min_df=2, ngram_range=(1, 3),\n",
       "                stop_words=[&#x27;i&#x27;, &#x27;me&#x27;, &#x27;my&#x27;, &#x27;myself&#x27;, &#x27;we&#x27;, &#x27;our&#x27;, &#x27;ours&#x27;,\n",
       "                            &#x27;ourselves&#x27;, &#x27;you&#x27;, &quot;you&#x27;re&quot;, &quot;you&#x27;ve&quot;, &quot;you&#x27;ll&quot;,\n",
       "                            &quot;you&#x27;d&quot;, &#x27;your&#x27;, &#x27;yours&#x27;, &#x27;yourself&#x27;, &#x27;yourselves&#x27;,\n",
       "                            &#x27;he&#x27;, &#x27;him&#x27;, &#x27;his&#x27;, &#x27;himself&#x27;, &#x27;she&#x27;, &quot;she&#x27;s&quot;,\n",
       "                            &#x27;her&#x27;, &#x27;hers&#x27;, &#x27;herself&#x27;, &#x27;it&#x27;, &quot;it&#x27;s&quot;, &#x27;its&#x27;,\n",
       "                            &#x27;itself&#x27;, ...])</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-4\" type=\"checkbox\" checked><label for=\"sk-estimator-id-4\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">TfidfVectorizer</label><div class=\"sk-toggleable__content\"><pre>TfidfVectorizer(max_df=0.95, min_df=2, ngram_range=(1, 3),\n",
       "                stop_words=[&#x27;i&#x27;, &#x27;me&#x27;, &#x27;my&#x27;, &#x27;myself&#x27;, &#x27;we&#x27;, &#x27;our&#x27;, &#x27;ours&#x27;,\n",
       "                            &#x27;ourselves&#x27;, &#x27;you&#x27;, &quot;you&#x27;re&quot;, &quot;you&#x27;ve&quot;, &quot;you&#x27;ll&quot;,\n",
       "                            &quot;you&#x27;d&quot;, &#x27;your&#x27;, &#x27;yours&#x27;, &#x27;yourself&#x27;, &#x27;yourselves&#x27;,\n",
       "                            &#x27;he&#x27;, &#x27;him&#x27;, &#x27;his&#x27;, &#x27;himself&#x27;, &#x27;she&#x27;, &quot;she&#x27;s&quot;,\n",
       "                            &#x27;her&#x27;, &#x27;hers&#x27;, &#x27;herself&#x27;, &#x27;it&#x27;, &quot;it&#x27;s&quot;, &#x27;its&#x27;,\n",
       "                            &#x27;itself&#x27;, ...])</pre></div></div></div></div></div>"
      ],
      "text/plain": [
       "TfidfVectorizer(max_df=0.95, min_df=2, ngram_range=(1, 3),\n",
       "                stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours',\n",
       "                            'ourselves', 'you', \"you're\", \"you've\", \"you'll\",\n",
       "                            \"you'd\", 'your', 'yours', 'yourself', 'yourselves',\n",
       "                            'he', 'him', 'his', 'himself', 'she', \"she's\",\n",
       "                            'her', 'hers', 'herself', 'it', \"it's\", 'its',\n",
       "                            'itself', ...])"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "stop_words = list(stopwords.words('english'))\n",
    "stop_words.remove(\"the\")\n",
    "trigram_vectorizer = TfidfVectorizer(stop_words=stop_words, min_df=2, ngram_range=(1,3), max_df=0.95)\n",
    "trigram_vectorizer.fit(train_df[\"text\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "757a23a6-4c14-492c-89b8-6765418182ff",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_keyword_strings_all(vectorizer, sorted_vector):\n",
    "    words = []\n",
    "    index_dict = vectorizer.get_feature_names_out()\n",
    "    for (item_index, score) in sorted_vector:\n",
    "        word = index_dict[item_index]\n",
    "        words.append(word)\n",
    "    return words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "900cb191-794e-42ba-8dea-2ca1948b3938",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['carry', 'theatre', 'scholarship', 'appeared', 'films', 'mrs', 'agent', 'drama', 'died', 'school']\n"
     ]
    }
   ],
   "source": [
    "keywords = get_keywords_simple(vectorizer, test_df.iloc[0][\"text\"])\n",
    "print(keywords)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "bca9d71f-9a02-437c-8b91-d80765a24cf5",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_keywords_complex(vectorizer, input_text, spacy_model, num_words=70):\n",
    "    keywords = []\n",
    "    doc = spacy_model(input_text)\n",
    "    vector = vectorizer.transform([input_text])\n",
    "    sorted = sort_coo(vector.tocoo())\n",
    "    ngrams = get_keyword_strings_all(vectorizer, sorted)\n",
    "    ents = [ent.text.lower() for ent in doc.noun_chunks]\n",
    "    for i in range(0, num_words):\n",
    "        keyword = ngrams[i]\n",
    "        if keyword.lower() in ents and not keyword.isdigit() and keyword not in keywords:\n",
    "            keywords.append(keyword)\n",
    "    return keywords"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "91305cd3-2bc2-47d5-9a22-e17a0d1fbc73",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['carry', 'films', 'stage', 'several years', 'saturday morning', 'star', 'film', 'london', 'beauty', 'the good', 'many years', 'directors']\n"
     ]
    }
   ],
   "source": [
    "keywords = get_keywords_complex(trigram_vectorizer, test_df.iloc[0][\"text\"], small_model)\n",
    "print(keywords)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "61e87a9d-c4f8-4184-a0fb-79c073394424",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
